Rio

1 Exploratory Data Analysis

1.1 Data variables

listings <- vroom("http://data.insideairbnb.com/brazil/rj/rio-de-janeiro/2020-06-19/data/listings.csv.gz")
glimpse(listings)
## Rows: 35,731
## Columns: 106
## $ id                                           <dbl> 17878, 21280, 25026, 3...
## $ listing_url                                  <chr> "https://www.airbnb.co...
## $ scrape_id                                    <dbl> 2.02e+13, 2.02e+13, 2....
## $ last_scraped                                 <date> 2020-06-19, 2020-06-1...
## $ name                                         <chr> "Very Nice 2Br in Copa...
## $ summary                                      <chr> "Discounts for long te...
## $ space                                        <chr> "- Beautiful, sunny 2 ...
## $ description                                  <chr> "Discounts for long te...
## $ experiences_offered                          <chr> "none", "none", "none"...
## $ neighborhood_overview                        <chr> "This is the one of th...
## $ notes                                        <chr> NA, NA, "For any stay ...
## $ transit                                      <chr> "Excellent location. C...
## $ access                                       <chr> "The entire apartment ...
## $ interaction                                  <chr> "I will be available t...
## $ house_rules                                  <chr> "Please leave the apar...
## $ thumbnail_url                                <lgl> NA, NA, NA, NA, NA, NA...
## $ medium_url                                   <lgl> NA, NA, NA, NA, NA, NA...
## $ picture_url                                  <chr> "https://a0.muscache.c...
## $ xl_picture_url                               <lgl> NA, NA, NA, NA, NA, NA...
## $ host_id                                      <dbl> 68997, 81163, 102840, ...
## $ host_url                                     <chr> "https://www.airbnb.co...
## $ host_name                                    <chr> "Matthias", "Jules", "...
## $ host_since                                   <date> 2010-01-08, 2010-02-1...
## $ host_location                                <chr> "Rio de Janeiro, State...
## $ host_about                                   <chr> "I  am a  journalist/w...
## $ host_response_time                           <chr> "within an hour", "wit...
## $ host_response_rate                           <chr> "100%", "100%", "70%",...
## $ host_acceptance_rate                         <chr> "100%", "90%", "77%", ...
## $ host_is_superhost                            <lgl> TRUE, FALSE, FALSE, TR...
## $ host_thumbnail_url                           <chr> "https://a0.muscache.c...
## $ host_picture_url                             <chr> "https://a0.muscache.c...
## $ host_neighbourhood                           <chr> "Copacabana", "Ipanema...
## $ host_listings_count                          <dbl> 2, 0, 3, 1, 1, 1, 7, 2...
## $ host_total_listings_count                    <dbl> 2, 0, 3, 1, 1, 1, 7, 2...
## $ host_verifications                           <chr> "['email', 'phone', 'r...
## $ host_has_profile_pic                         <lgl> TRUE, TRUE, TRUE, TRUE...
## $ host_identity_verified                       <lgl> TRUE, TRUE, TRUE, TRUE...
## $ street                                       <chr> "Rio de Janeiro, Rio d...
## $ neighbourhood                                <chr> "Copacabana", "Ipanema...
## $ neighbourhood_cleansed                       <chr> "Copacabana", "Ipanema...
## $ neighbourhood_group_cleansed                 <lgl> NA, NA, NA, NA, NA, NA...
## $ city                                         <chr> "Rio de Janeiro", "Rio...
## $ state                                        <chr> "Rio de Janeiro", "RJ"...
## $ zipcode                                      <chr> "22020-050", "22420-01...
## $ market                                       <chr> "Rio De Janeiro", "Rio...
## $ smart_location                               <chr> "Rio de Janeiro, Brazi...
## $ country_code                                 <chr> "BR", "BR", "BR", "BR"...
## $ country                                      <chr> "Brazil", "Brazil", "B...
## $ latitude                                     <dbl> -23.0, -23.0, -23.0, -...
## $ longitude                                    <dbl> -43.2, -43.2, -43.2, -...
## $ is_location_exact                            <lgl> TRUE, TRUE, TRUE, TRUE...
## $ property_type                                <chr> "Condominium", "Apartm...
## $ room_type                                    <chr> "Entire home/apt", "En...
## $ accommodates                                 <dbl> 5, 6, 2, 3, 2, 2, 13, ...
## $ bathrooms                                    <dbl> 1.0, 2.0, 1.0, 1.0, 1....
## $ bedrooms                                     <dbl> 2, 2, 1, 1, 1, 1, 6, 1...
## $ beds                                         <dbl> 2, 4, 2, 1, 1, 1, 6, 1...
## $ bed_type                                     <chr> "Real Bed", "Real Bed"...
## $ amenities                                    <chr> "{TV,\"Cable TV\",Inte...
## $ square_feet                                  <dbl> NA, NA, NA, NA, NA, 0,...
## $ price                                        <chr> "$191.00", "$424.00", ...
## $ weekly_price                                 <chr> NA, "$4,945.00", NA, N...
## $ monthly_price                                <chr> NA, "$17,455.00", NA, ...
## $ security_deposit                             <chr> "$0.00", "$2,647.00", ...
## $ cleaning_fee                                 <chr> "$250.00", "$265.00", ...
## $ guests_included                              <dbl> 2, 6, 2, 2, 2, 2, 7, 1...
## $ extra_people                                 <chr> "$0.00", "$0.00", "$45...
## $ minimum_nights                               <dbl> 7, 5, 7, 2, 2, 3, 2, 3...
## $ maximum_nights                               <dbl> 180, 30, 60, 1125, 89,...
## $ minimum_minimum_nights                       <dbl> 7, 5, 7, 2, 2, 3, 2, 3...
## $ maximum_minimum_nights                       <dbl> 7, 5, 7, 2, 2, 3, 2, 3...
## $ minimum_maximum_nights                       <dbl> 1125, 30, 60, 1125, 89...
## $ maximum_maximum_nights                       <dbl> 1125, 30, 60, 1125, 89...
## $ minimum_nights_avg_ntm                       <dbl> 7.0, 5.0, 7.0, 2.0, 2....
## $ maximum_nights_avg_ntm                       <dbl> 1125, 30, 60, 1125, 89...
## $ calendar_updated                             <chr> "4 months ago", "4 mon...
## $ has_availability                             <lgl> TRUE, TRUE, TRUE, TRUE...
## $ availability_30                              <dbl> 0, 19, 0, 0, 0, 23, 27...
## $ availability_60                              <dbl> 6, 49, 0, 0, 0, 52, 54...
## $ availability_90                              <dbl> 36, 79, 0, 0, 0, 71, 8...
## $ availability_365                             <dbl> 287, 93, 154, 0, 170, ...
## $ calendar_last_scraped                        <date> 2020-06-19, 2020-06-1...
## $ number_of_reviews                            <dbl> 251, 94, 238, 282, 181...
## $ number_of_reviews_ltm                        <dbl> 20, 6, 7, 25, 23, 33, ...
## $ first_review                                 <date> 2010-07-15, 2014-02-1...
## $ last_review                                  <date> 2020-04-06, 2020-03-2...
## $ review_scores_rating                         <dbl> 93, 97, 94, 96, 94, 98...
## $ review_scores_accuracy                       <dbl> 9, 10, 9, 10, 10, 10, ...
## $ review_scores_cleanliness                    <dbl> 10, 10, 9, 10, 9, 10, ...
## $ review_scores_checkin                        <dbl> 10, 10, 9, 10, 10, 10,...
## $ review_scores_communication                  <dbl> 10, 10, 10, 10, 10, 10...
## $ review_scores_location                       <dbl> 10, 10, 10, 10, 10, 10...
## $ review_scores_value                          <dbl> 9, 10, 9, 10, 9, 10, 9...
## $ requires_license                             <lgl> FALSE, FALSE, FALSE, F...
## $ license                                      <lgl> NA, NA, NA, NA, NA, NA...
## $ jurisdiction_names                           <lgl> NA, NA, NA, NA, NA, NA...
## $ instant_bookable                             <lgl> TRUE, FALSE, FALSE, TR...
## $ is_business_travel_ready                     <lgl> FALSE, FALSE, FALSE, F...
## $ cancellation_policy                          <chr> "moderate", "strict_14...
## $ require_guest_profile_picture                <lgl> FALSE, FALSE, TRUE, FA...
## $ require_guest_phone_verification             <lgl> FALSE, FALSE, TRUE, FA...
## $ calculated_host_listings_count               <dbl> 1, 1, 3, 1, 1, 1, 5, 1...
## $ calculated_host_listings_count_entire_homes  <dbl> 1, 1, 3, 1, 1, 1, 3, 0...
## $ calculated_host_listings_count_private_rooms <dbl> 0, 0, 0, 0, 0, 0, 2, 1...
## $ calculated_host_listings_count_shared_rooms  <dbl> 0, 0, 0, 0, 0, 0, 0, 0...
## $ reviews_per_month                            <dbl> 2.08, 1.22, 1.95, 2.33...

1.2 Data wrangling

Since some price-related variables are quantitative, we need to make sure it is stored as numeric in the dataframe.

listings <- listings %>% 
  mutate(price=parse_number(price)) %>% 
  mutate(cleaning_fee=parse_number(cleaning_fee)) %>% 
  mutate(extra_people=parse_number(extra_people))

typeof(listings$price)
## [1] "double"
typeof(listings$cleaning_fee)
## [1] "double"
typeof(listings$extra_people)
## [1] "double"

Now that price, cleaning_fee, extra_people variables are numeric only.

Use skimr::skim() function to view the summary of the dataframe.

listings %>% skimr::skim()
Data summary
Name Piped data
Number of rows 35731
Number of columns 106
_______________________
Column type frequency:
character 43
Date 5
logical 16
numeric 42
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
listing_url 0 1.00 34 37 0 35731 0
name 58 1.00 1 255 0 34488 0
summary 2588 0.93 1 1000 0 31747 0
space 14186 0.60 1 1000 0 20709 0
description 1433 0.96 1 1000 0 33409 0
experiences_offered 0 1.00 4 4 0 1 0
neighborhood_overview 15776 0.56 1 1000 0 18303 0
notes 24127 0.32 1 1000 0 10380 0
transit 15967 0.55 1 1000 0 18237 0
access 19597 0.45 1 1000 0 14325 0
interaction 18251 0.49 1 1000 0 15456 0
house_rules 17304 0.52 1 1000 0 16288 0
picture_url 0 1.00 81 146 0 35146 0
host_url 0 1.00 39 43 0 24992 0
host_name 5 1.00 1 35 0 6429 0
host_location 168 1.00 2 253 0 1180 0
host_about 18793 0.47 1 8586 0 9714 22
host_response_time 5 1.00 3 18 0 5 0
host_response_rate 5 1.00 2 4 0 63 0
host_acceptance_rate 5 1.00 2 4 0 100 0
host_thumbnail_url 5 1.00 55 106 0 24859 0
host_picture_url 5 1.00 57 109 0 24859 0
host_neighbourhood 13638 0.62 3 33 0 164 0
host_verifications 0 1.00 2 158 0 345 0
street 0 1.00 10 112 0 502 0
neighbourhood 1892 0.95 3 24 0 101 0
neighbourhood_cleansed 0 1.00 3 24 0 156 0
city 162 1.00 1 69 0 317 0
state 35 1.00 1 51 0 52 0
zipcode 1752 0.95 1 24 0 4261 1
market 43 1.00 4 21 0 11 0
smart_location 0 1.00 9 77 0 341 0
country_code 0 1.00 2 2 0 1 0
country 0 1.00 6 6 0 1 0
property_type 0 1.00 3 22 0 36 0
room_type 0 1.00 10 15 0 4 0
bed_type 0 1.00 5 13 0 5 0
amenities 0 1.00 2 1267 0 33605 0
weekly_price 33335 0.07 7 11 0 758 0
monthly_price 33070 0.07 7 11 0 861 0
security_deposit 15680 0.56 5 10 0 737 0
calendar_updated 0 1.00 5 14 0 90 0
cancellation_policy 0 1.00 6 27 0 6 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
last_scraped 0 1.00 2020-06-19 2020-06-21 2020-06-19 3
host_since 5 1.00 2009-03-29 2020-06-17 2016-01-26 3301
calendar_last_scraped 0 1.00 2020-06-19 2020-06-21 2020-06-19 3
first_review 14991 0.58 2010-06-07 2020-06-18 2018-02-14 2506
last_review 14991 0.58 2012-02-21 2020-06-19 2020-01-31 1529

Variable type: logical

skim_variable n_missing complete_rate mean count
thumbnail_url 35731 0 NaN :
medium_url 35731 0 NaN :
xl_picture_url 35731 0 NaN :
host_is_superhost 5 1 0.13 FAL: 31028, TRU: 4698
host_has_profile_pic 5 1 1.00 TRU: 35588, FAL: 138
host_identity_verified 5 1 0.27 FAL: 26186, TRU: 9540
neighbourhood_group_cleansed 35731 0 NaN :
is_location_exact 0 1 0.69 TRU: 24496, FAL: 11235
has_availability 0 1 1.00 TRU: 35731
requires_license 0 1 0.00 FAL: 35731
license 35731 0 NaN :
jurisdiction_names 35731 0 NaN :
instant_bookable 0 1 0.40 FAL: 21360, TRU: 14371
is_business_travel_ready 0 1 0.00 FAL: 35731
require_guest_profile_picture 0 1 0.02 FAL: 35176, TRU: 555
require_guest_phone_verification 0 1 0.02 FAL: 35177, TRU: 554

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 2.13e+07 1.37e+07 1.79e+04 1.10e+07 1.71e+07 3.43e+07 4.38e+07 ▅▇▃▃▆
scrape_id 0 1.00 2.02e+13 0.00e+00 2.02e+13 2.02e+13 2.02e+13 2.02e+13 2.02e+13 ▁▁▇▁▁
host_id 0 1.00 8.50e+07 8.94e+07 1.17e+04 1.50e+07 5.62e+07 1.12e+08 3.51e+08 ▇▃▁▁▁
host_listings_count 5 1.00 3.58e+01 3.12e+02 0.00e+00 1.00e+00 1.00e+00 3.00e+00 3.32e+03 ▇▁▁▁▁
host_total_listings_count 5 1.00 3.58e+01 3.12e+02 0.00e+00 1.00e+00 1.00e+00 3.00e+00 3.32e+03 ▇▁▁▁▁
latitude 0 1.00 -2.30e+01 3.00e-02 -2.31e+01 -2.30e+01 -2.30e+01 -2.29e+01 -2.28e+01 ▁▇▃▁▁
longitude 0 1.00 -4.32e+01 1.00e-01 -4.37e+01 -4.33e+01 -4.32e+01 -4.32e+01 -4.31e+01 ▁▁▂▂▇
accommodates 0 1.00 4.18e+00 2.63e+00 1.00e+00 2.00e+00 4.00e+00 5.00e+00 1.60e+02 ▇▁▁▁▁
bathrooms 67 1.00 1.67e+00 1.05e+00 0.00e+00 1.00e+00 1.00e+00 2.00e+00 2.00e+01 ▇▁▁▁▁
bedrooms 79 1.00 1.62e+00 1.09e+00 0.00e+00 1.00e+00 1.00e+00 2.00e+00 3.00e+01 ▇▁▁▁▁
beds 331 0.99 2.54e+00 2.08e+00 0.00e+00 1.00e+00 2.00e+00 3.00e+00 6.20e+01 ▇▁▁▁▁
square_feet 35279 0.01 5.25e+02 9.83e+02 0.00e+00 0.00e+00 1.94e+02 7.53e+02 1.08e+04 ▇▁▁▁▁
price 0 1.00 7.86e+02 2.38e+03 0.00e+00 1.55e+02 3.00e+02 6.42e+02 1.32e+05 ▇▁▁▁▁
cleaning_fee 11584 0.68 1.64e+02 1.94e+02 0.00e+00 8.90e+01 1.50e+02 2.00e+02 7.41e+03 ▇▁▁▁▁
guests_included 0 1.00 1.73e+00 1.60e+00 1.00e+00 1.00e+00 1.00e+00 2.00e+00 3.60e+01 ▇▁▁▁▁
extra_people 0 1.00 4.24e+01 1.11e+02 0.00e+00 0.00e+00 0.00e+00 5.00e+01 5.55e+03 ▇▁▁▁▁
minimum_nights 0 1.00 4.91e+00 2.23e+01 1.00e+00 1.00e+00 2.00e+00 4.00e+00 1.12e+03 ▇▁▁▁▁
maximum_nights 0 1.00 2.87e+04 5.29e+06 1.00e+00 3.00e+01 1.12e+03 1.12e+03 1.00e+09 ▇▁▁▁▁
minimum_minimum_nights 0 1.00 4.78e+00 2.12e+01 1.00e+00 1.00e+00 2.00e+00 4.00e+00 1.10e+03 ▇▁▁▁▁
maximum_minimum_nights 0 1.00 5.12e+00 2.14e+01 1.00e+00 1.00e+00 3.00e+00 5.00e+00 1.10e+03 ▇▁▁▁▁
minimum_maximum_nights 0 1.00 2.87e+04 5.29e+06 1.00e+00 4.00e+01 1.12e+03 1.12e+03 1.00e+09 ▇▁▁▁▁
maximum_maximum_nights 0 1.00 2.87e+04 5.29e+06 1.00e+00 4.40e+01 1.12e+03 1.12e+03 1.00e+09 ▇▁▁▁▁
minimum_nights_avg_ntm 0 1.00 4.89e+00 2.13e+01 1.00e+00 1.00e+00 2.10e+00 4.00e+00 1.10e+03 ▇▁▁▁▁
maximum_nights_avg_ntm 0 1.00 2.87e+04 5.29e+06 1.00e+00 4.35e+01 1.12e+03 1.12e+03 1.00e+09 ▇▁▁▁▁
availability_30 0 1.00 1.65e+01 1.39e+01 0.00e+00 0.00e+00 2.70e+01 3.00e+01 3.00e+01 ▆▁▁▁▇
availability_60 0 1.00 3.44e+01 2.79e+01 0.00e+00 0.00e+00 5.40e+01 5.90e+01 6.00e+01 ▆▁▁▁▇
availability_90 0 1.00 5.25e+01 4.18e+01 0.00e+00 0.00e+00 8.30e+01 8.90e+01 9.00e+01 ▆▁▁▁▇
availability_365 0 1.00 1.72e+02 1.55e+02 0.00e+00 0.00e+00 1.68e+02 3.62e+02 3.65e+02 ▇▃▂▁▇
number_of_reviews 0 1.00 9.63e+00 2.56e+01 0.00e+00 0.00e+00 1.00e+00 6.00e+00 4.06e+02 ▇▁▁▁▁
number_of_reviews_ltm 0 1.00 3.06e+00 6.93e+00 0.00e+00 0.00e+00 0.00e+00 2.00e+00 1.05e+02 ▇▁▁▁▁
review_scores_rating 15873 0.56 9.46e+01 9.58e+00 2.00e+01 9.30e+01 9.80e+01 1.00e+02 1.00e+02 ▁▁▁▁▇
review_scores_accuracy 15892 0.56 9.64e+00 9.20e-01 2.00e+00 1.00e+01 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
review_scores_cleanliness 15887 0.56 9.39e+00 1.13e+00 2.00e+00 9.00e+00 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
review_scores_checkin 15893 0.56 9.81e+00 7.20e-01 2.00e+00 1.00e+01 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
review_scores_communication 15885 0.56 9.76e+00 8.10e-01 2.00e+00 1.00e+01 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
review_scores_location 15891 0.56 9.75e+00 7.40e-01 2.00e+00 1.00e+01 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
review_scores_value 15889 0.56 9.29e+00 1.06e+00 2.00e+00 9.00e+00 1.00e+01 1.00e+01 1.00e+01 ▁▁▁▁▇
calculated_host_listings_count 0 1.00 8.07e+00 3.42e+01 1.00e+00 1.00e+00 1.00e+00 3.00e+00 3.19e+02 ▇▁▁▁▁
calculated_host_listings_count_entire_homes 0 1.00 7.20e+00 3.39e+01 0.00e+00 1.00e+00 1.00e+00 2.00e+00 3.17e+02 ▇▁▁▁▁
calculated_host_listings_count_private_rooms 0 1.00 7.50e-01 1.59e+00 0.00e+00 0.00e+00 0.00e+00 1.00e+00 2.00e+01 ▇▁▁▁▁
calculated_host_listings_count_shared_rooms 0 1.00 9.00e-02 6.50e-01 0.00e+00 0.00e+00 0.00e+00 0.00e+00 1.10e+01 ▇▁▁▁▁
reviews_per_month 14991 0.58 5.50e-01 7.30e-01 1.00e-02 1.10e-01 2.60e-01 6.80e-01 8.55e+00 ▇▁▁▁▁

There are 106 variables and 35,731 observations.

The cleaning fee numeric variable has 11,584 missing observations, however this is not an issue as some hosts prefer including the cleaning fee in their listed price rather than displaying it separately. In fact, we think that AirBnb clients feel like they are striking a happy medium when they don’t see any separate price to the main listed price. Moreover, clients could well be cleaner and more respectful of the accommodation when they don’t see that they are paying for a cleaning fee.

Other variables such as Access, Transit or House_rules also have more than 10,000 missing values. However once again, this is probably due to the host discretionary marketing decision, and these instructions are probably either obvious or included in other parts of the listing information rather than randomly missing.

1.3 Handling missing values (NAs)

Next, replace the missing values of cleaning_fee with an numeric 0.

listings <- listings %>%
  mutate(cleaning_fee = case_when(
    is.na(cleaning_fee) ~ 0, 
    TRUE ~ cleaning_fee
  ))
missing = listings %>% skimr::skim() %>% filter(skim_variable == 'cleaning_fee') %>% select(n_missing)
missing$n_missing
## [1] 0

Now there are no longer any missing values of cleaning_fee.

Next, we look at the variable property_type and use the count() function to determine how many categories there are and their frequency. Specifically, we like to know:

  • What are the top 4 most common property types?
  • What proportion of the total listings do they make up?
property_type <- listings %>%
  group_by(property_type) %>% 
  summarise(n=n()) %>%
  arrange(desc(n)) %>%
  mutate(property_type, `Percent (%)`=round((n / nrow(listings)) * 100, 1))

property_type

Counting the different property types shows that the top 4 are Apartment, House, Condominium and Loft.

The top 4 property types thus account for 94.6% of the total listings, hence our decision to assign the other property types to the ‘Other’ category.

listings <- listings %>%
  mutate(prop_type_simplified = case_when(
    property_type %in% c("Apartment","House", "Condominium","Loft") ~ property_type, 
    TRUE ~ "Other"))

prop_type_simplified  <- listings %>% 
  group_by(prop_type_simplified ) %>% 
  summarise(n=n()) %>%
  arrange(desc(n)) %>%
  mutate(prop_type_simplified, `Percent (%)`=round((n / nrow(listings)) * 100, 1))

prop_type_simplified

Airbnb is most commonly used for travel purposes, i.e., as an alternative to traditional hotels. We only want to include listings in our regression analysis that are intended for travel purposes:

min_stay <- listings %>% 
  count(minimum_nights) %>% 
  arrange(desc(n))
min_stay <- mutate(min_stay, `Percent (%)`=round((n / nrow(listings)) * 100, 3))
min_stay

The most common value for minimum stay requirement is 1 night, with 2, 3, 5 and 4 nights following closely.

Some values seem very high, such as 1123 or 930, as they span longer than one year. This could be due to listing errors, scams or simply the host having put its flat on hold for an undefined period of time as it is unexpected for a guest to book a flat for such a long period of time.

For later analysis, we shall filter the airbnb data so that it only includes observations with minimum_nights <= 4.

2 Mapping

An overview of the spatial distribution of AirBnB rentals whose minimum_nights is less than equal to four (4).

leaflet(data = filter(listings, minimum_nights <= 4)) %>% 
  addProviderTiles("OpenStreetMap.Mapnik") %>% 
  addCircleMarkers(lng = ~longitude, 
                   lat = ~latitude, 
                   radius = 1, 
                   fillColor = "blue", 
                   fillOpacity = 0.4, 
                   popup = ~listing_url,
                   label = ~property_type)

3 Regression Analysis

Create a new variable called price_4_nights that uses price, cleaning_fee, guests_included, and extra_people to calculate the total cost for two people to stay at the Airbnb property for 4 nights. This is the variable we want to explain.

In addition, we filter out those accommodating less than 2 guests, minimum_nights more than 4 nights, maximum_nights less than 4.

listings <- listings %>%
  filter(accommodates >=2 & minimum_nights <=4 & maximum_nights >= 4 & guests_included <= 2 & !is.na(number_of_reviews) &       !is.na(review_scores_rating)) %>%
  mutate(price_4_nights=ifelse(guests_included==2, 4*(price+cleaning_fee), 4*(price+cleaning_fee+extra_people))) %>%
  mutate(price_4_nights_log=log10(price_4_nights))

Use histograms or density plots to examine the distributions of price_4_nights and log(price_4_nights).

g1=ggplot(listings, aes(price_4_nights, fill=prop_type_simplified)) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + ggtitle("Density of Cost") +
  geom_histogram(stat="density")

g2=ggplot(listings, aes(price_4_nights, fill=prop_type_simplified)) +
  scale_x_continuous(breaks=c(50,100,200,300,500,1000,3000,8000,20000,40000), trans="log10") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + ggtitle("Density of Cost (log10)") +
  geom_histogram(stat="density")

plot_list <- list(g1,g2) 

ggm_density <- ggmatrix(
 plots = plot_list,
 nrow = 1, ncol = 2,
 xAxisLabels = c("price_4_nights", "log10(price_4_nights)"),
 yAxisLabels = c("Density"),
 byrow = FALSE,
 title = "",
 legend = 1
)
ggm_density

As we can see that price_4_nights is extremely skewed to left side, due to some outliers with extremely high prices.

Even though using log(price_4_nights) removes the skew issue, it will also remove many details in analysis.

Since the skew issue is due to extreme outliers, we should be able to take 95% quantile to remove the outliers.

Next, do charting again with new data set.

price_max = quantile(listings$price_4_nights, c(0.95))[[1]]
listings <- listings %>%
  filter(price_4_nights <= price_max)

ggplot(listings, aes(price_4_nights, fill=prop_type_simplified)) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + ggtitle("Density of Cost") +
  geom_histogram(stat="density")

As we can see from the above chart, quantile of 95% will give much better distribution. We will use the adjusted data set for analysis below.

3.1 Model 1

Per project assignment, we fit a regression model called model1 with the following explanatory variables: prop_type_simplified, number_of_reviews, and review_scores_rating.

model1 <- lm(price_4_nights ~ prop_type_simplified + number_of_reviews + review_scores_rating, data=listings)
model1 %>% broom::tidy()

From the summary:

  1. prop_type_simplified is a significant predictor to price_4_nights. Apartment are most expensive, followed by House and Condominium; This is in line with our experience and expectation;

  2. On the other hands, review_scores_rating has little relationship to price_4_nights;

  3. Lastly, number_of_reviews has even less relationship to price_4_nights.

3.2 Model 2 (Model1 + room_type)

We want to determine if room_type is a significant predictor of the cost for 4 nights, given everything else in the model.

Now Create a new regression model called model2 that includes all of the explanantory variables in model1 plus room_type.

model2 <- lm(price_4_nights ~ prop_type_simplified + number_of_reviews + review_scores_rating + room_type, data = listings)
model2 %>% broom::tidy()

From the summary, we can tell:

  1. room_type is a significant predictor of price_4_nights. Entire home/appt are most expensive, followed by Hotel room, Private room, and Shared room is cheapest.

We can use ggpairs2 to get more insight about these variables related to each other.

ggpairs2 <- listings %>% 
  select(price_4_nights, number_of_reviews, review_scores_rating, prop_type_simplified, room_type) %>%
  GGally::ggpairs(mapping = ggplot2::aes(colour=room_type), legend = 1) +
  theme(legend.position = "bottom")

ggpairs2

From the chart, we can tell:

  1. review_scores_rating has a low coefficient -0.101 to price_4_nights; this is consistent to the model summary above. As we can see from the chart, listings within low to medium price range receive most review score feedback and more higher rating than rentals with high prices; However, it could be a result of higher consumer expectation, as the higher price they paid, the higher quality they would expect from the rentals.

  2. number_of_reviews, given a very low coefficient 0.013, has even less relationship to price_4_nights. The distribution of number_of_reviews looks similar to that of price_4_nights. So it seems more a result of the number of visited customers.

  3. Hotel room has least count; this mostly because of the nature of Airbnb’s business, which is more for private properties, and less for hotels.

3.3 Diagnostics, collinearity, summary tables

3.3.1 Check the residuals, using autoplot

#install.packages("ggfortify")
library(ggfortify)
autoplot(model1)

autoplot(model2)

3.3.2 colinearity

Run car::vif() to check about colinearity

#install.packages("car")
car::vif(model1)
##                      GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.01  4            1.00
## number_of_reviews    1.01  1            1.01
## review_scores_rating 1.00  1            1.00
#install.packages("car")
car::vif(model2)
##                      GVIF Df GVIF^(1/(2*Df))
## prop_type_simplified 1.23  4            1.03
## number_of_reviews    1.02  1            1.01
## review_scores_rating 1.01  1            1.00
## room_type            1.24  3            1.04

For both models the variables are very little correlated.

3.3.3 Summary model Comparison between model1 and model2

Run huxtable::huxreg to compare the two models

#install.packages("huxreg")
huxtable::huxreg(model1, model2,
                 statistics = c('#observations' = 'nobs', 
                                'R squared' = 'r.squared', 
                                'Adj. R Squared' = 'adj.r.squared', 
                                'Residual SE' = 'sigma'), 
                 bold_signif = 0.05
) %>% 
  huxtable::set_caption('Comparison of models')
Comparison of models
(1)(2)
(Intercept)1428.616 ***1495.232 ***
(78.514)   (72.898)   
prop_type_simplifiedCondominium29.318    47.740    
(31.758)   (29.478)   
prop_type_simplifiedHouse-481.647 ***-141.544 ***
(29.101)   (28.154)   
prop_type_simplifiedLoft-144.891 ** -299.894 ***
(46.391)   (43.194)   
prop_type_simplifiedOther-313.479 ***-171.544 ***
(31.286)   (30.664)   
number_of_reviews-2.767 ***-3.437 ***
(0.229)   (0.213)   
review_scores_rating1.470    3.078 ***
(0.824)   (0.766)   
room_typeHotel room        -551.811 ***
        (92.440)   
room_typePrivate room        -704.118 ***
        (16.315)   
room_typeShared room        -807.047 ***
        (57.379)   
#observations12016        12016        
R squared0.039    0.173    
Adj. R Squared0.039    0.172    
Residual SE852.204    790.807    
*** p < 0.001; ** p < 0.01; * p < 0.05.

model2 is the better model. We use broom::tidy() and broom::glance() to get more about the model.

model2 %>% broom::tidy()
## # A tibble: 10 x 5
##    term                            estimate std.error statistic  p.value
##    <chr>                              <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)                      1495.      72.9       20.5  6.37e-92
##  2 prop_type_simplifiedCondominium    47.7     29.5        1.62 1.05e- 1
##  3 prop_type_simplifiedHouse        -142.      28.2       -5.03 5.04e- 7
##  4 prop_type_simplifiedLoft         -300.      43.2       -6.94 4.04e-12
##  5 prop_type_simplifiedOther        -172.      30.7       -5.59 2.26e- 8
##  6 number_of_reviews                  -3.44     0.213    -16.1  5.02e-58
##  7 review_scores_rating                3.08     0.766      4.02 5.87e- 5
##  8 room_typeHotel room              -552.      92.4       -5.97 2.45e- 9
##  9 room_typePrivate room            -704.      16.3      -43.2  0.      
## 10 room_typeShared room             -807.      57.4      -14.1  1.40e-44
model2 %>% broom::glance()
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic p.value    df  logLik    AIC    BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>   <dbl>  <dbl>  <dbl>
## 1     0.173         0.172  791.      279.       0     9 -97228. 1.94e5 1.95e5
## # ... with 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>

3.3.4 Prediction from the model

The equation is:

price_4_nights = 1495.23 + 
                 47.74*(prop_type_simplifiedCondominium) +
                 -141.54*(prop_type_simplifiedHouse) +
                 -299.89*(prop_type_simplifiedLoft) +
                 -171.54*(prop_type_simplifiedOther) + 
                 -3.44*(number_of_reviews) +
                 3.08*(review_scores_rating) +
                 -551.81(room_typeHotel room) +
                 -704.12*(room_typePrivate room) +
                 -807.05*(room_typeShared room)

Add real test here!

3.4 Further variables/questions to explore

We want to know which variable should we use for the regression model? and why?

Now we create correlation matrix for as many variables as possible, in order to find out those matter most w.r.t price_4_nights.

We then figure how are AirBnB prices distributed with those factors, decide which variable should you use for the regression model.

Some interested variables are in character format, so we change them to numeric for analysis later.

listings <- listings %>%
  mutate(host_response_rate=parse_number(host_response_rate)) %>%
  mutate(host_acceptance_rate=parse_number(host_acceptance_rate)) %>%
  mutate(security_deposit=parse_number(security_deposit))
# select interested variables 
corr_listings <- listings %>%
  filter(!is.na(host_response_rate) & !is.na(host_acceptance_rate)) %>%  
  select(price_4_nights,
         host_response_rate, 
         host_acceptance_rate, 
         host_listings_count, 
         accommodates, 
         bedrooms, 
         beds, 
         bathrooms, 
         guests_included, 
         number_of_reviews, 
         number_of_reviews_ltm, 
         reviews_per_month,
         review_scores_rating,
         review_scores_checkin,
         review_scores_cleanliness,
         review_scores_accuracy,
         review_scores_communication,
         review_scores_location,
         review_scores_value 
         )


names(corr_listings)
##  [1] "price_4_nights"              "host_response_rate"         
##  [3] "host_acceptance_rate"        "host_listings_count"        
##  [5] "accommodates"                "bedrooms"                   
##  [7] "beds"                        "bathrooms"                  
##  [9] "guests_included"             "number_of_reviews"          
## [11] "number_of_reviews_ltm"       "reviews_per_month"          
## [13] "review_scores_rating"        "review_scores_checkin"      
## [15] "review_scores_cleanliness"   "review_scores_accuracy"     
## [17] "review_scores_communication" "review_scores_location"     
## [19] "review_scores_value"
M = cor(corr_listings, use="complete.obs")
corrplot(M)

What we can tell from the matrix above:

  • accommodates, bedrooms, bathrooms, beds are the most significant positive predictors of price_4_nights;
  • host_listings_count has some possible impact to price_4_nights but less significant;
  • reviews_per_month, number_of_reviews, number_of_reviews_ltm negative related to price_4_nights;
  • review_scores_rating has insignificant impact to price_4_nights, which quite contraindicate common sense;
  • review-related variables (accuracy, communication, cleanliness, checkin) are highly correlated with each other. This is not surprising, as guest’s feedback on one apect could affect others.

3.4.1 Are the number of bathrooms, bedrooms, beds, or size of the house (accommodate) significant predictors of price_4_nights?

Yes

3.4.2 Do superhosts (host_is_superhost) command a pricing premium, after controlling for other variables?

Airbnb tags hosts, who are experienced hosts who provide a shining example for other hosts, and extraordinary experiences for their guests, with a badge that will automatically appear on their listing and profile to help customers identify them.

To find out whether it command a pricing premium, we control variables in a way more in line with travel purposes.

bedroom <= 2, bathroom > 1, beds >= 2
listings_superhost = listings %>% 
  filter(bedrooms < 2 & bathrooms > 1 & beds >= 2) 
model_superhost <- lm(price_4_nights ~ host_is_superhost, data = listings_superhost)
model_superhost %>% broom::tidy()
## # A tibble: 2 x 5
##   term                  estimate std.error statistic   p.value
##   <chr>                    <dbl>     <dbl>     <dbl>     <dbl>
## 1 (Intercept)              1311.      34.3     38.2  1.99e-172
## 2 host_is_superhostTRUE    -151.      73.3     -2.06 4.02e-  2
  1. Most owners advertise the exact location of their listing (is_location_exact == TRUE), while a non-trivial proportion don’t. After controlling for other variables, is a listing’s exact location a significant predictor of price_4_nights?

  2. For all cities, there are 3 variables that relate to neighbourhoods: neighbourhood, neighbourhood_cleansed, and neighbourhood_group_cleansed. There are typically more than 20 neighbourhoods in each city, and it would’t make sense to include them all in your model. Use your city knowledge, or ask someone with city knowledge, and see whether you can group neighbourhoods together so the majority of listings falls in fewer (5-6 max) geographical areas. You would thus need to create a new categorical variabale neighbourhood_simplified and determine whether location is a predictor of price_4_nights

  3. What is the effect of cancellation_policy on price_4_nights, after we control for other variables?

3.4.3 Finding predictors

3.4.4 Controlling factor variables

We decide on choosing a few explanatory variables that have low numbers of n_unique values for our factor variables:

  • neighbourhood
  • Property_type

others that are likely to influence price:

  • host_is_superhost
  • Review_scores_cleanliness
  • Review_scores_location
  • minimum_nights
  • square_feet
  • bedrooms
  • bathrooms

Now we will analyze how neighbourhood related to price.

3.4.5 neighbourhood

Looking at the map, properties of outskirts of Rio seems reduce much.

neighbour_list <- listings %>% 
  filter(!is.na(neighbourhood)) %>%
  group_by(neighbourhood) %>% 
  summarise(n=n(), prop_mean_price=mean(price_4_nights))  %>% 
  arrange(desc(n)) %>%
  mutate(`Percent (%)`=round((n / nrow(listings)) * 100, 1))
neighbour_list
## # A tibble: 78 x 4
##    neighbourhood                n prop_mean_price `Percent (%)`
##    <chr>                    <int>           <dbl>         <dbl>
##  1 Copacabana                3540           1475.          29.5
##  2 Barra da Tijuca           1812           1626.          15.1
##  3 Ipanema                   1103           1910.           9.2
##  4 Botafogo                   661           1291.           5.5
##  5 Leblon                     567           2026.           4.7
##  6 Santa Teresa               483           1076.           4  
##  7 Recreio dos Bandeirantes   480           1499.           4  
##  8 Flamengo                   357           1245.           3  
##  9 Centro                     239           1008.           2  
## 10 Leme                       238           1433.           2  
## # ... with 68 more rows

After removing missing observations and keeping only the 13 main neighbourhoods of Rio, namely the ones displaying more than 200 Air Bnb listings, we get a sample of 10,694 listings from which to start our analysis. This corresponds to a proportion of around 85% of the population for which there are no NA values and is therefore a good basis for our analysis.

main_neighbourhood <- listings %>% group_by(neighbourhood) %>% tally() %>% filter(!is.na(neighbourhood) & n > 200)
main_neighbourhood
## # A tibble: 12 x 2
##    neighbourhood                n
##    <chr>                    <int>
##  1 Barra da Tijuca           1812
##  2 Botafogo                   661
##  3 Centro                     239
##  4 Copacabana                3540
##  5 Flamengo                   357
##  6 Ipanema                   1103
##  7 Lapa                       222
##  8 Laranjeiras                235
##  9 Leblon                     567
## 10 Leme                       238
## 11 Recreio dos Bandeirantes   480
## 12 Santa Teresa               483
sum(main_neighbourhood$n)
## [1] 9937
# filter main neighbourhoods
listings_nb <- listings %>% filter(neighbourhood %in% c(main_neighbourhood$neighbourhood))

leaflet(data = listings_nb) %>% 
  addProviderTiles("OpenStreetMap.Mapnik") %>% 
  addCircleMarkers(lng = ~longitude, 
                   lat = ~latitude, 
                   radius = 1, 
                   fillColor = "blue", 
                   fillOpacity = 0.4, 
                   popup = ~listing_url,
                   label = ~property_type)

> We then look at how neighbourhoods related to price.

ggplot(listings_nb, aes(x = price_4_nights, fill = neighbourhood )) + 
   ggtitle("Count of Price by neighbourhood") +
   geom_histogram(alpha = 0.5, position = "identity")

 ggplot(listings_nb, aes(neighbourhood, price_4_nights, fill = neighbourhood)) +
   theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) + 
   ggtitle("Price Span per Neighbourhood") +
   geom_boxplot()

Copacabana has largest count of properties and widest price span; while Ipanema and Leblon have higher mean prices.

These are quite in line with our impression, as the three area are most popular for tourists.